library(tidyverse)
library(maptools)
library(sp)
library(rgdal)
library(choroplethrMaps)
library(choroplethr)
library(ggridges)
library(forcats)
library(GGally)
library(gridExtra)
library(extracat)
library(plotly)
## Data
Demo<-read.csv("demographics.csv")
Health.Summary<-read.csv("summary measures of health.csv")
Risk<-read.csv("risk factors and access to care.csv")
Birth.Death<-read.csv("measures of birth and death.csv")
Health<-cbind(Demo[,c(1:6,9,12,15,30,33,36,39,42)], Health.Summary[,c(7,11,17,23)],Risk[,c(7,10,13,16,19,22)],Birth.Death[,c(85,91,97,109,121)])
## Identify NA
for (i in 7:29){
Health[,i]<-ifelse(Health[,i]<0,NA,Health[,i])
}
## Health by State
Health.State<-Health %>%
group_by(CHSI_State_Name) %>%
summarize(State_FIPS_Code=mean(State_FIPS_Code),
Population.Size=sum(Population_Size),
Poverty=sum(Population_Size*0.01*Poverty, na.rm=TRUE)/sum(Population_Size,na.rm = TRUE)*100,
Life_exp=mean(ALE, na.rm=TRUE),
All_Death=sum(All_Death, na.rm = TRUE)/sum(Population_Size, na.rm = TRUE)*100,
Health_Status=sum(Population_Size*0.01*Health_Status,na.rm=TRUE)/sum(Population_Size)*100,
Unhealthy_Days=mean(Unhealthy_Days,na.rm=TRUE),
No_Exercise=sum(Population_Size*0.01*No_Exercise, na.rm=TRUE)/sum(Population_Size, na.rm = TRUE)*100,
Few_Fruit_Veg=sum(Population_Size*0.01*Few_Fruit_Veg, na.rm=TRUE)/sum(Population_Size, na.rm = TRUE)*100,
Obese=sum(Population_Size*0.01*Obesity, na.rm=TRUE)/sum(Population_Size, na.rm = TRUE)*100,
High_Blood_Pres=sum(Population_Size*0.01*High_Blood_Pres, na.rm=TRUE)/sum(Population_Size, na.rm = TRUE)*100,
Smoker=sum(Population_Size*0.01*Smoker, na.rm=TRUE)/sum(Population_Size, na.rm = TRUE)*100,
Diabetes=sum(Population_Size*0.01*Diabetes, na.rm=TRUE)/sum(Population_Size, na.rm = TRUE)*100,
Brst_Cancer=sum(Brst_Cancer,na.rm = TRUE)/sum(Population_Size, na.rm = TRUE)*100,
Col_Cancer=sum(Col_Cancer,na.rm = TRUE)/sum(Population_Size, na.rm = TRUE)*100,
CHD=sum(CHD,na.rm = TRUE)/sum(Population_Size, na.rm = TRUE)*100,
Lung_Cancer=sum(Lung_Cancer,na.rm = TRUE)/sum(Population_Size, na.rm = TRUE)*100,
Stroke=sum(Stroke,na.rm = TRUE)/sum(Population_Size, na.rm = TRUE)*100)
Health.State[,c(4:19)]<-round(Health.State[,c(4:19)],2)
Health.State$CHSI_State_Abbr<-names(table(Health$CHSI_State_Abbr))
Health.State[Health.State$CHSI_State_Name=="Alaska",]<-ifelse(Health.State[Health.State$CHSI_State_Name=="Alaska",]==0, NA, Health.State[Health.State$CHSI_State_Name=="Alaska",])
write.csv(Health.State,"Health.State.csv")
visna(Health, sort = "b")
visna(Health.State, sort = "b")
## correlation grid
ggcorr(Health.State[,-c(1:2,20)], label = TRUE, label_round = 2, hjust = 0.75, size = 3, label_size = 2, title="Correlation Plot")+labs(title = "Correlation Plot")
tidydf<-gather(Health.State, key = "Diseases", value = "Percentage", colnames(Health.State%>%select(15:19)))
## Scatter plot (no exercise)
sc1<-ggplot(data = tidydf, mapping = aes(x = No_Exercise, y = Percentage)) +
geom_point(aes(color = Diseases),size=0.7,alpha=0.7)
## Scatter plot (few fruit and veg)
sc2<-ggplot(data = tidydf, mapping = aes(x = Few_Fruit_Veg, y = Percentage)) +
geom_point(aes(color = Diseases),size=0.7,alpha=0.7)
## Scatter plot (Obesity)
sc3<-ggplot(data = tidydf, mapping = aes(x = Obese, y = Percentage)) +
geom_point(aes(color = Diseases),size=0.7,alpha=0.7)
## Scatter plot (High blood pre)
sc4<-ggplot(data = tidydf, mapping = aes(x = High_Blood_Pres, y = Percentage)) +
geom_point(aes(color = Diseases),size=0.7,alpha=0.7)
## Scatter plot (Smoker)
sc5<-ggplot(data = tidydf, mapping = aes(x = Smoker, y = Percentage)) +
geom_point(aes(color = Diseases),size=0.7,alpha=0.7)
## Scatter plot (Diabetes)
sc6<-ggplot(data = tidydf, mapping = aes(x =Diabetes, y = Percentage)) +
geom_point(aes(color = Diseases),size=0.7,alpha=0.7)
grid.arrange(sc1, sc2,sc3,sc4,sc5,sc6, nrow = 3)
### boxplot
New<-gather(Health, key="Categories", value = "Percentage", colnames(Health%>%select(19:24)))
New<-na.omit(New[,c(24,25)])
ggplot(New) + geom_boxplot(aes(y=Percentage,x=reorder(Categories, Percentage, median)))+
ylab("Percentage")+xlab("Risk Factors")+ggtitle("Boxplot of Risk Factors")+
coord_flip()
## ridge
ggplot(New)+
geom_density_ridges(aes(x=New$Percentage, y=reorder(New$Categories, New$Percentage, median)))+xlab("Percentage")+ylab("Risk Factors")+ggtitle("Ridgeline Plot of Risk Factors")
## PCA biplot
HS.pc<- as.data.frame(Health.State[,c(9:14)])
rownames(HS.pc) <- Health.State$CHSI_State_Name
pr.out<-prcomp(na.omit(HS.pc),scale=TRUE)
biplot(pr.out, expand=1.3, cex=c(0.6,0.6), main="Biplot of risk factors (Scaled)", xlim=c(-0.4, 0.4),
ylim=c(-0.5,0.5), col=c("orange","blue"))
#average life expectancy ALE
countycode<-substr(paste("00",Health[,2],sep=''),nchar(paste("00",Health[,2],sep=''))-2,nchar(paste("00",Health[,2],sep='')))
code<-as.numeric(paste(Health[,1],countycode,sep=''))
county.ALE<-data.frame(region=as.numeric(paste(Health[,1],countycode,sep='')),value=Health[,15])
county_choropleth(county.ALE,
title = "County Data, Average Life Expectancy (ALE)",
legend = "ALE")
county.No_Exercise<-cbind.data.frame(region=code,value=Health[,19])
county_choropleth(county.No_Exercise,
title = "County Data, No_Exercise",
legend = "No_Exercise")
p1<-plot_ly(x=Health$ALE,
y=Health$No_Exercise
)
p1 %>% add_histogram2d(nbinsx=30, nbinsy=30)
## clevland
ggplot(tidydf, aes(Percentage, fct_reorder2(`CHSI_State_Name`, Diseases=="CHD", Percentage, .desc = FALSE), color = Diseases)) +
geom_point() +
ggtitle("Diseases sorted by Coronary Heart Disease Percentage") + ylab("")
-- For Example: Coronary Heart Disease (CHD) County Level Map
county.CHD<-cbind.data.frame(region=code,value=Health[,27]/Health[,7]*100)
county_choropleth(county.CHD,
title = "County Data, CHD",
legend = "CHD")
## Warning in super$initialize(map.df, user.df): Your data.frame contains the
## following regions which are not mappable: 2201, 2232, 2280
## Warning in self$bind(): The following regions were missing and are being
## set to NA: 31075, 31115, 2105, 49009, 48033, 48301, 2198, 15005, 31005,
## 48269, 2060, 2282, 48261, 8014, 30069, 31117, 8053, 8111, 2195, 2230, 2068,
## 2013, 2275, 16025